In [116]:
"""Global ESG FEATURES SAMPLE"""

'''SAMPLE SMALLER AS ESG DATA BEGINS END OF 2001

ESG SAMPLE - 
20,711 observations 
619 firms

14,592 from developed nations - 17 countries 
6,119 from developing nations - 19 countries

NON-ESG SAMPLE
28,035 observations
825 firms'''
Out[116]:
'SAMPLE SMALLER AS ESG DATA BEGINS END OF 2001\n\nESG SAMPLE - \n20,711 observations \n619 firms\n\n14,592 from developed nations - 17 countries \n6,119 from developing nations - 19 countries\n\nNON-ESG SAMPLE\n28,035 observations\n825 firms'
In [73]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('bmh')

os.chdir('/Users/lockiemichalski/Documents/Files_Mac_before_clean/Desktop/UQ/RQ3_2020/Credit_rating_WRDS/Steps_sample/Files to upload rand/Global_data/Draft2_data')
In [74]:
data=pd.read_csv('GLOBAL_SAMPLE_ESG_ONLY.csv') # data
data = data[data['loc']!='US']
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3071: DtypeWarning: Columns (186,188,190,192,193,196,199,202,219,232,243,244,245,257,258,267,283,301,305,306,307,309,310,318,322,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
In [75]:
data.sort_values(by='rating_rank', inplace=True) # sort S&P rating feature
In [76]:
"WITHOUT FINANCIALS DISTRIBUTION"

plt.figure(figsize = (20, 10), dpi = 200)
sns.countplot(x='Issuer Rating', data=data).set_title('All-classes')
# class C is not in graph as there are only 2 observations and for classifiers to run this has to removed. 
Out[76]:
Text(0.5, 1.0, 'All-classes')
In [126]:
'''average % of non-missing values for countries in sample'''

#start from col 22

non_missing_df = pd.DataFrame() # empty df
non_missing_df[list(data.iloc[:,26:].columns)]=data.groupby('loc')[list(data.iloc[:,26:].columns)]\
.apply(lambda x: x.notnull().sum()/len(x)*100) # calc non-missing for each feature
non_missing_df['average_count']=non_missing_df.mean(axis=1)

#non-missing df excluding ESG features with missing data - refer to sum stats GLOBAL csv file
''' 421 ESG features '''

'''Referred excel file Global sum stats '''
ESG_sum_stats = pd.read_csv('GLOBAL_ESG_sum_stats.csv')

'''Only 30 features have <5% missing values'''
ESG_30_cols = list(ESG_sum_stats.iloc[:,1:30].columns)

data_cols = list(data.iloc[:,26:183].columns) # data columns
cols = data_cols + ESG_30_cols # combine ESG cols 30 features with other data in list
dropped_ESG_features = pd.DataFrame() # empty df
dropped_ESG_features[cols]=data.groupby('loc')[cols]\
.apply(lambda x: x.notnull().sum()/len(x)*100) # calc non-missing for each feature
dropped_ESG_features['average_count']=dropped_ESG_features.mean(axis=1)
In [121]:
'''Plot non-missing average count for each country'''
plot = non_missing_df['average_count'].reset_index()
plot.sort_values(by='average_count', ascending=False, inplace=True)    
plt.figure(figsize=(16,8))
plt.bar(plot['loc'],plot['average_count'])
plt.ylabel('% non-missing values')
plt.title('Non-missing values by country')
plt.yticks(np.arange(10, 110, 10))
plt.xlabel('Country')
Out[121]:
Text(0.5, 0, 'Country')
In [128]:
'''Large majority of the missing values are from ESG features not having any data'''

'''Plot non-missing average count for each country'''
plot = dropped_ESG_features['average_count'].reset_index()
plot.sort_values(by='average_count', ascending=False, inplace=True)    
plt.figure(figsize=(16,8))
plt.bar(plot['loc'],plot['average_count'])
plt.ylabel('% non-missing values')
plt.title('Non-missing values by country with ESG features < 5% missing data')
plt.yticks(np.arange(10, 110, 10))
plt.xlabel('Country')
Out[128]:
Text(0.5, 0, 'Country')
In [152]:
"Function to plot the missing values"

def missing_value_plot(df,title):
    null_counts = df.isnull().sum()/len(df)
    null_counts = null_counts.sort_values()
    plt.figure(figsize=(16,8))
    plt.xticks(np.arange(len(null_counts))+0.5,null_counts.index,rotation='vertical')
    plt.ylabel('Fraction of rows with missing data (%)')
    plt.bar(np.arange(len(null_counts)),null_counts)
    plt.title(title)
    plt.xlabel('Features')
In [153]:
missing_value_plot(data.iloc[:,31:89], 'Financial ratios') # financial ratios missing data + 3 categorical industry 
'''note: 
capeiq approx 18.75% - needs 5y of rolling data for each firm - some firms don't have this in the data
cash_conversion, inv_turn, inv_actq all have a numerator or denominator = 0. Those missing values are simply 
undefined values. What do we do about these?'''
Out[153]:
"note: \ncapeiq approx 18.75% - needs 5y of rolling data for each firm - some firms don't have this in the data\ncash_conversion, inv_turn, inv_actq all have a numerator or denominator = 0. Those missing values are simply \nundefined values. What do we do about these?"
In [138]:
data.rename(columns={'Recommendation - Mean (1-5)':'Rec_Mean',
                        'Recommendation - Number Of Total':'Rec_Total',
                        'Recommendation - Median (1-5)':'Rec_Median',
                        'Recommendation - Low (1-5)':'Rec_Low',
                        'Recommendation - High (1-5)':'Rec_High',
                        'Recommendation - Number Of Strong Buy':'Rec_SBuy',
                        'Recommendation - Number Of Buy': 'Rec_Buy',
                        'Recommendation - Number Of Hold': 'Rec_Hold',
                        'Recommendation - Number Of Sell': 'Rec_Sell',
                        'Recommendation - Number Of Strong Sell': 'Rec_SSell',
                        'Recommendation - Number Of No Opinion':'Rec_NoOpinion',
                        'Long Term Growth - Mean':'LTG_Mean',
                        'Number of Analysts':'Num_Analyst'}, inplace=True)
In [154]:
missing_value_plot(data[['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
 'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']], 'Analyst recommendations')

'''NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the 
firm having any recommendations. What should we do about this?'''
Out[154]:
'NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the \nfirm having any recommendations. What should we do about this?'
In [155]:
macro_features = ['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit', 
                          'consumer_confidence', 'consumer_spending', 'current_account_balance', 
                          'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance', 
                          'fx_USD', 'government_bond', 'government_external_debt', 'government_spending', 
                          'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop', 
                          'imports_goods_services', 'industrial_production', 'international_reserves',
                          'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0', 
                          'money_supply_M1', 'money_supply_M2', 'money_supply_M3', 'overnight_rate', 'population', 
                          'retail_sales', 'sov_outlook_rank','stock_index', 'terms_of_trade', 
                          'unemployment_rate']

missing_value_plot(data[macro_features], 'Macro')

'''Used Eikon and St Louis and missing data is data that I cannot retrieve at all for a country from the data
sources'''
Out[155]:
'Used Eikon and St Louis and missing data is data that I cannot retrieve at all for a country from the data\nsources'
In [156]:
'''Same missing as macro'''
macro_features_growth = [col for col in data if col.endswith('%')]
missing_value_plot(data[macro_features_growth], 'Macro growth')
In [157]:
missing_value_plot(data[ESG_30_cols], 'ESG features <5% missing values') # financial ratios missing data + 3 categorical industry 
In [145]:
print(macro_features)
['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit', 'consumer_confidence', 'consumer_spending', 'country', 'current_account_balance', 'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance', 'fx_USD', 'government_bond', 'government_external_debt', 'government_spending', 'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop', 'imports_goods_services', 'industrial_production', 'international_reserves', 'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0', 'money_supply_M1', 'money_supply_M2', 'money_supply_M3', 'overnight_rate', 'population', 'rating_rank', 'retail_sales', 'sov_outlook_rank', 'sov_rating_rank', 'stock_index', 'terms_of_trade', 'total_external_debt', 'unemployment_rate']
In [149]:
'''Box plot of numeric macro features'''

fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [88]:
'''Box plot of financial ratio features - Capitalization'''

fin_capitalization=['capital_ratioq']

fig, ax = plt.subplots(1,2, figsize=(20, 10))
for var, subplot in zip(fin_capitalization, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [89]:
'''Box plot of financial ratio features - Efficiency'''

fin_efficiency=['at_turnq','inv_turnq','pay_turnq','rect_turnq','sale_equityq','sale_nwcq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_efficiency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [90]:
'''Box plot of financial ratio features - Financial Soundness'''

fin_finsoundness=['invt_actq','rect_actq','fcf_ocfq','ocf_lctq','cash_debtq',
                'cash_ltq','cfmq','short_debtq','profit_lctq',
                'curr_debtq','debt_ebitdaq','dltt_beq','int_debtq',
                'int_totdebtq','lt_debtq','lt_ppentq']

fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_finsoundness, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [91]:
'''Box plot of financial ratio features - Liquidity'''

fin_liquidity=['cash_ratioq','curr_ratioq','quick_ratioq','accrualq']

fig, ax = plt.subplots(2,2, figsize=(30, 30))
for var, subplot in zip(fin_liquidity, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [92]:
'''Box plot of financial ratio features - Profitability''' #NO FINANCIALS

fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']

fig, ax = plt.subplots(7,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [93]:
'''Box plot of financial ratio features - Solvency'''

fin_solvency = ['de_ratioq','debt_atq','lt_atq','debt_capitalq','intcovq','intcov_ratioq']

fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_solvency, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [94]:
'''Box plot of financial ratio features - Valuation'''

fin_valuation = ['dprq','bmq','capeiq','evmq','pcfq','psq','ptbq']

fig, ax = plt.subplots(4,2, figsize=(30, 30))
for var, subplot in zip(fin_valuation, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [95]:
'''Box plot of financial ratio features - other'''

fin = ['USD_mktcap','USD_price_adj']

fig, ax = plt.subplots(1,3, figsize=(30, 10))
for var, subplot in zip(fin, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [110]:
''''no more than 5% missing data is in 30 of the features for global sample'''
ESG_30_cols = list(ESG_sum_stats.iloc[:,1:30].columns)
print(ESG_30_cols)
['Anti Takeover Devices Above Two', 'Board Size More Ten Less Eight', 'ESG Score', 'Management Score', 'Shareholders Score', 'CSR Strategy Score', 'ESG Combined Score', 'Governance Pillar Score', 'Board Size', 'Announced Layoffs To Total Employees', 'Resource Use Score', 'Emissions Score', 'Workforce Score', 'Human Rights Score', 'Community Score', 'Product Responsibility Score', 'ESG Controversies Score', 'Social Pillar Score', 'Environment Pillar Score', 'Executive Members Gender Diversity, Percent', 'Non-Executive Board Members', 'Board Gender Diversity, Percent', 'Environmental Innovation Score', 'Board Member Affiliations', 'Independent Board Members', 'Audit Committee Independence', 'Audit Committee NonExecutive Members', 'Net Employment Creation', 'Average Board Tenure']
In [115]:
fig, ax = plt.subplots(10,3, figsize=(30, 40))
for var, subplot in zip(ESG_30_cols, ax.flatten()):
    sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
In [150]:
# get the distribution of a feature per class with Yeo-Johnson transformation

from scipy.stats import anderson
from scipy.stats import kstest
#from scipy.stats import zscore
from scipy.stats import yeojohnson

import scipy.stats as st

'''Interested in each distribution by each class'''
def get_best_distribution_groupby(data,rating):
    dist_names = ["norm", "lognorm","exponweib", "weibull_max", 
                  "weibull_min", "pareto", "genextreme","logistic"]
    dist_results = []
    params = {}
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)

        params[dist_name] = param
        # Applying the Kolmogorov-Smirnov test
        D, p = kstest(data, dist_name, args=param)
        dist_results.append((dist_name, p))

    # select the best fitted distribution
    best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
    # store the name of the best fit and its p value

    print("Best fitting distribution " +str(rating)+": " +str(best_dist))
    return best_dist

def dist_by_class(class_col, feature_col, df):
    df = df[[class_col,feature_col]]
    unique_ir = df[class_col].unique()
    #rating_zscores = []
    rating_yeojohnson = []
    rating_dist = []
    for rating in unique_ir:
        data = df[df[class_col]==rating]
        #z = zscore(data[feature_col].dropna())
        xt, lmbda = yeojohnson(np.array(data[feature_col].dropna()))
        if len(xt) == 0:
            print('No obs')
        else:
            #rating_zscores.append(xt)
            rating_yeojohnson.append(xt)
            print(str(len(xt)) + " length without outliers")
            """For anderson:
            dist must be 'norm', 'expon', 'gumbel', 'extreme1' or 'logistic'."""
            and_darl = anderson(xt, 'norm') #and_darl
            #print("and_darl_test:  " + str(and_darl))   
            best_dist = get_best_distribution_groupby(xt,rating) #kstest
            rating_dist.append(best_dist)
            
    fig, ax = plt.subplots(5,5, 
                          figsize=(30,30))
    for rating,rating_dist,unique_rating,subplot in zip(rating_yeojohnson,rating_dist,
                                                        list(unique_ir), ax.flatten()):
        sns.distplot(rating, ax=subplot).set_title(str(unique_rating)+' '+str(rating_dist))
In [151]:
dist_by_class('Issuer Rating', 'ESG Combined Score', data)
102 length without outliers
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:1677: RuntimeWarning: invalid value encountered in add
  logp = (np.log(a) + np.log(c) + sc.xlogy(a - 1.0, exm1c) +
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_distn_infrastructure.py:2429: RuntimeWarning: invalid value encountered in double_scalars
  Lhat = muhat - Shat*mu
Best fitting distribution AAA: norm
94 length without outliers
Best fitting distribution AA+: genextreme
258 length without outliers
Best fitting distribution AA: weibull_min
651 length without outliers
/Users/lockiemichalski/opt/anaconda3/lib/python3.8/site-packages/scipy/stats/_continuous_distns.py:2666: RuntimeWarning: invalid value encountered in subtract
  -pex2+logpex2-logex2)
Best fitting distribution AA-: norm
1149 length without outliers
Best fitting distribution A+: norm
1048 length without outliers
Best fitting distribution A: norm
2578 length without outliers
Best fitting distribution A-: logistic
3766 length without outliers
Best fitting distribution BBB+: genextreme
3269 length without outliers
Best fitting distribution BBB: genextreme
2894 length without outliers
Best fitting distribution BBB-: norm
1483 length without outliers
Best fitting distribution BB+: exponweib
1209 length without outliers
Best fitting distribution BB: weibull_max
760 length without outliers
Best fitting distribution BB-: exponweib
546 length without outliers
Best fitting distribution B+: exponweib
405 length without outliers
Best fitting distribution B: exponweib
245 length without outliers
Best fitting distribution B-: logistic
109 length without outliers
Best fitting distribution CCC+: logistic
54 length without outliers
Best fitting distribution CCC: genextreme
20 length without outliers
Best fitting distribution CCC-: norm
32 length without outliers
Best fitting distribution CC: logistic
18 length without outliers
Best fitting distribution SD: exponweib
18 length without outliers
Best fitting distribution D: logistic
In [ ]: